library(readxl)
library(dplyr)
library(tidyr)
library(stringr)
library(tidyverse)
library(DT)
library(patchwork)
library(ggthemes)
library(plotly)
library(gapminder)
library(grid)
library(ggplot2)
library(gridExtra)
library(maps)
library(usmap)
# install.packages("DescTools")
# install.packages("car")
# install.packages("rpart")
# install.packages("rpart.plot")
library(DescTools)
library(car)
library(rpart)
library(rpart.plot)
Exploring the Impact of COVID-19 on State-Level Economic Indicators and Housing Prices in the U.S.
This project explores how the COVID-19 pandemic affected economic conditions, focusing on housing prices across U.S. states. Although the final goal is to analyze which factors contributed to post-COVID changes in housing prices, the project also covers the essential groundwork: collecting, cleaning, visualizing, and statistically analyzing the data.
Includes two datasets: one providing nationwide annual COVID case totals and another offering state-by-state case counts from 2020 to 2023. These datasets help track the spread and intensity of the pandemic across different regions and time periods, offering important context for analyzing trends.
Source: Centers for Disease Control and Prevention (CDC) (https://covid.cdc.gov/COVID-DATA-TRACKER/#datatracker-home) & USA Facts (https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/)
File name: cases_deaths.csv & covid_confirmed_usafacts.csv
A measure of the typical home value and market changes across a given region and housing type. It reflects the typical value for homes in the 35th to 65th percentile range. Available as a smoothed, seasonally adjusted measure and as a raw measure.
Source: Zillow Research (https://www.zillow.com/research/data/)
File name: State_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv
This dataset shows monthly unemployment rates by U.S. state. It is part of the Local Area Unemployment Statistics (LAUS) program by the Bureau of Labor Statistics (BLS). The data is seasonally adjusted and helps track labor market trends at the state level over time.
Source :U.S. Bureau of Labor Statistics (https://www.bls.gov/charts/state-employment-and-unemployment/state-unemployment-rates-animated.htm)
File name: unemployment rate.xlsx
Reports annual state-level PCE data, measuring average household spending across different regions.
Source: Bureau of Economic Analysis (BEA) (https://apps.bea.gov/regional/downloadzip.htm)
File name: SAPCE1__ALL_AREAS_1997_2023.csv
Includes annual personal income per capita by state. Useful for assessing economic status and regional income disparities. Source: Bureau of Economic Analysis (BEA) File name: SAINC1__ALL_AREAS_1929_2024.csv → INCOME.csv
Provides real GDP data by state from 1997 to 2024. This helps analyze economic output and growth across regions.
Source: Bureau of Economic Analysis (BEA)
File name: SAGDP1__ALL_AREAS_1997_2024.csv
Includes per capita credit card debt and total household debt across states. Used to assess consumer financial health.
| Sheet | Title |
|---|---|
| population | Number of Consumers in New York Fed Consumer Credit Panel |
| auto loan | Auto Debt Balance per Capita |
| credit card | Credit Card Debt Balance per Capita |
| mortgage | Mortgage Debt Balance per Capita (excluding HELOC) |
| student loan | Student Loan Debt Balance per Capita |
| total | Total Debt Balance per Capita |
| auto loan delinquency | Percent of Auto Debt Balance 90+ Days Delinquent |
| credit card delinquency | Percent of Credit Card Debt Balance 90+ Days Delinquent |
| mortgage delinquency | Percent of Mortgage Debt Balance 90+ Days Delinquent |
| student loan delinquency | Percent of Student Loan Debt Balance 90+ Days Delinquent (and in default) |
Source: New York Fed (https://www.newyorkfed.org/microeconomics/databank.html) File name: area_report_by_year.xlsx → NYFED.csv
This code chunk is for cleaning the nationwide COVID case data.
# Aggregate nationwide daily COVID counts to one row per year-month.
covid <- read_csv("data/cases_deaths.csv")
#View(covid)
covid_US <- covid %>%
  filter(country == "United States") %>%
  select(country, date, new_cases, total_cases, weekly_cases) %>%
  mutate(
    date = as.Date(date),
    year = year(date),
    month = month(date, label = TRUE, abbr = FALSE)
  ) %>%
  group_by(year, month) %>%
  summarise(
    monthly_new_cases = sum(new_cases, na.rm = TRUE),
    # total_cases is cumulative, so the month's max is its latest running total.
    latest_total_cases = max(total_cases, na.rm = TRUE),
    total_weekly_cases = sum(weekly_cases, na.rm = TRUE),
    .groups = "drop"
  )
covid_US
## # A tibble: 145 × 5
## year month monthly_new_cases latest_total_cases total_weekly_cases
## <dbl> <ord> <dbl> <dbl> <dbl>
## 1 1 January 820465 103436829 4160575
## 2 1 February 701457 103436829 4418032
## 3 1 March 531503 103436829 4672369
## 4 1 April 616176 103436829 4930142
## 5 1 May 1107849 103436829 5419154
## 6 1 June 1378542 103436829 5839205
## 7 1 July 945231 103436829 6101223
## 8 1 August 1056186 103436829 6336944
## 9 1 September 1122716 103436829 6758203
## 10 1 October 740848 103436829 6967548
## # ℹ 135 more rows
This chunk is for cleaning COVID case per state
# Collapse county-level rows to one row per state by summing every numeric
# (daily cumulative count) column.
covidbystate <- read_csv("data/covid_confirmed_usafacts.csv")
covidStatesDaily <- covidbystate %>%
  group_by(State) %>%
  # Lambda form: passing `na.rm` through `...` of across() is deprecated
  # in dplyr >= 1.1 and will stop working.
  summarize(across(where(is.numeric), ~ sum(.x, na.rm = TRUE)))
#covidStatesDaily
covidstates <- covidStatesDaily %>%
  pivot_longer(
    cols = -State,
    names_to = "date",
    values_to = "cases"
  ) %>%
  # Keep only columns whose names are real YYYY-MM-DD dates (drops summed
  # metadata columns such as countyFIPS).
  filter(str_detect(date, "^\\d{4}-\\d{2}-\\d{2}$")) %>%
  mutate(
    date = ymd(date),
    year = year(date)
  ) %>%
  # Counts are cumulative, so the yearly total is the max observed that year.
  group_by(State, year) %>%
  summarise(total_cases = max(cases, na.rm = TRUE), .groups = "drop")
#covidstates
This chunk is to make the states categories
# Severity bins from the five-number summary of yearly state totals.
# summary() returns Min, 1st Qu., Median, Mean, 3rd Qu., Max in that order;
# index 4 (the mean) is intentionally skipped below.
case_summary <- summary(covidstates$total_cases)
#case_summary
covidstates <- covidstates %>%
mutate(
severity = case_when(
total_cases <= case_summary[1] ~ "Very low", # NOTE(review): <= Min matches only the single smallest value — confirm intended
total_cases <= case_summary[2] ~ "Low", # <= 1st quartile
total_cases <= case_summary[3] ~ "Medium", # <= median
total_cases <= case_summary[5] ~ "High", # <= 3rd quartile
TRUE ~ "Very high"
)
)
# Ordered factor so legends and axes run from lowest to highest severity.
covidstates <- covidstates %>%
mutate(severity = factor(severity, levels = c("Very low", "Low", "Medium", "High", "Very high")))
covidstates
## # A tibble: 204 × 4
## State year total_cases severity
## <chr> <dbl> <dbl> <fct>
## 1 AK 2020 46304 Low
## 2 AK 2021 149907 Low
## 3 AK 2022 277884 Low
## 4 AK 2023 287319 Low
## 5 AL 2020 361226 Medium
## 6 AL 2021 896614 High
## 7 AL 2022 1568934 High
## 8 AL 2023 1659936 High
## 9 AR 2020 225138 Low
## 10 AR 2021 562455 Medium
## # ℹ 194 more rows
This chunk is for cleaning housing data
# Zillow ZHVI (typical home value, 35th-65th percentile tier): reshape the
# monthly wide file and average it to one value per state per year.
house <- read_csv("data/State_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv")
house1 <- house %>%
rename(State = RegionName) %>%
select(-StateName, -RegionID, -SizeRank, -RegionType) # drop Zillow metadata columns
# Wide (one column per month) -> long (one row per state-month).
house_long <- house1 %>%
pivot_longer(cols = -State, names_to = "Date", values_to = "ZHVI")
#house_long
# Keep the pre-COVID (2017-2019) and post-COVID (2020-2022) study window.
house_long1 <- house_long %>%
filter(str_sub(Date, 1, 4) %in% c("2017", "2018", "2019", "2020", "2021", "2022"))
#house_long1
house_long2 <- house_long1 %>%
mutate(year = as.numeric(str_sub(Date, 1, 4))) # column names are "YYYY-MM-DD" strings
# Average the monthly ZHVI within each state-year.
house_avg <- house_long2 %>%
group_by(State, year) %>%
summarise(mean_zhvi = mean(ZHVI, na.rm = TRUE), .groups = "drop")
#house_avg
# Back to wide: one column per year, for the change formulas in the next chunk.
house2 <- house_avg %>%
pivot_wider(names_from = year, values_from = mean_zhvi)
#house2
This chunk is to calculate the formula
# Pre/post-COVID housing summary per state:
#   zhvi_*_avg    — mean of the three annual means (rowMeans skips NA years)
#   zhvi_*_change — percent change between the window's endpoint years
house3 <- house2 %>%
mutate(
zhvi_pre_avg = rowMeans(select(., `2017`, `2018`, `2019`), na.rm = TRUE), # `.` is the magrittr pipe placeholder
zhvi_post_avg = rowMeans(select(., `2020`, `2021`, `2022`), na.rm = TRUE),
zhvi_pre_change = ((`2019` - `2017`) / `2017`)*100,
zhvi_post_change = ((`2022` - `2020`) / `2020`)*100
) %>%
select(State, zhvi_pre_avg, zhvi_post_avg, zhvi_pre_change, zhvi_post_change)
#house3
# Lookup table mapping full state names to two-letter abbreviations;
# reused by the unemployment/PCE/income/GDP chunks below.
state_codes <- read.csv("https://raw.githubusercontent.com/jasonong/List-of-US-States/master/states.csv")
house4 <- house3 %>%
left_join(state_codes, by = c("State" = "State"))
# Switch the State key to the abbreviation so it joins with the other datasets.
house_final <- house4 %>%
select(Abbreviation, zhvi_pre_avg, zhvi_post_avg, zhvi_pre_change, zhvi_post_change) %>%
rename(State = Abbreviation)
house_final
## # A tibble: 51 × 5
## State zhvi_pre_avg zhvi_post_avg zhvi_pre_change zhvi_post_change
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 AL 148993. 192427. 9.76 28.0
## 2 AK 292351. 335028. 7.66 12.3
## 3 AZ 252522. 368418. 15.0 49.4
## 4 AR 140966. 175352. 7.83 26.1
## 5 CA 521016. 663241. 12.7 31.3
## 6 CO 375678. 486205. 12.6 32.8
## 7 CT 251600. 305414. 3.63 28.0
## 8 DE 262137. 318745. 5.58 27.9
## 9 DC 560797. 633072. 8.25 7.84
## 10 FL 228291. 308956. 13.2 45.6
## # ℹ 41 more rows
# export to file but I will # because we already have sync folder with github
#write.csv(house_final, "data/data-clean/HOUSING.csv", row.names = FALSE)
This chunk is for cleaning unemployment rate data
# BLS LAUS unemployment rates, one column per month.
unemploy <- read_excel("data/unemployment rate.xlsx", range = "A1:DR53")
# Column headers come in as Excel serial day numbers; convert to "YYYY-MM-DD".
# Origin 1899-12-30 is the Windows Excel epoch.
names(unemploy)[-1] <- as.character(as.Date(as.numeric(names(unemploy)[-1]), origin = "1899-12-30"))
# Pre-COVID change: Dec 2019 minus Dec 2017, in percentage points.
pre_df <- unemploy %>%
select(State, `2017-12-01`, `2019-12-01`) %>%
# Some columns are read as character, so coerce to numeric.
mutate(across(-State, ~ as.numeric(.))) %>%
mutate(unemp_pre = `2019-12-01` - `2017-12-01`) %>%
rename(unemp_19 = `2019-12-01`, unemp_17=`2017-12-01`) %>%
select(State, unemp_pre, unemp_17 ,unemp_19)
# Post-COVID change: Dec 2022 minus Dec 2020, in percentage points.
post_df <- unemploy %>%
select(State, `2020-12-01`, `2022-12-01`) %>%
# Some columns are read as character, so coerce to numeric.
mutate(across(-State, ~ as.numeric(.))) %>%
mutate(unemp_post = `2022-12-01` - `2020-12-01`) %>%
rename(unemp_22 = `2022-12-01`, unemp_20 = `2020-12-01`) %>%
select(State, unemp_post, unemp_20, unemp_22)
unemploy2 <- pre_df %>%
inner_join(post_df, by = "State")
# Attach two-letter abbreviations (state_codes loaded in the housing chunk).
unemploy3 <- unemploy2 %>%
left_join(state_codes, by = c("State" = "State"))
unemploy_final <- unemploy3 %>%
select(Abbreviation, unemp_pre,unemp_17, unemp_19, unemp_post,unemp_20, unemp_22) %>%
rename(State = Abbreviation)
unemploy_final
## # A tibble: 52 × 7
## State unemp_pre unemp_17 unemp_19 unemp_post unemp_20 unemp_22
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AL -0.9 4 3.1 -2.2 4.5 2.3
## 2 AK -1.4 6.6 5.2 -3.1 7.1 4
## 3 AZ -0.200 4.9 4.7 -2.8 6.5 3.7
## 4 AR -0.300 3.8 3.5 -2 5 3
## 5 CA -0.400 4.5 4.1 -4.6 9 4.4
## 6 CO -0.100 2.9 2.8 -3.4 6.4 3
## 7 CT -0.600 4.3 3.7 -3.9 7.4 3.5
## 8 DE -0.6 4.2 3.6 -1.6 5.6 4
## 9 DC -0.5 5.9 5.4 -3.1 7.8 4.7
## 10 FL -1 4 3 -3.3 6.2 2.9
## # ℹ 42 more rows
# export to file but I will # because we already have sync folder with github
#write.csv(unemploy_final, "data/data-clean/UNEMPLOY.csv", row.names = FALSE)
This chunk is for cleaning Personal consumption expenditures
# BEA personal consumption expenditures (PCE): pre/post-COVID growth by state.
pce <- read.csv("data/SAPCE1__ALL_AREAS_1997_2023.csv")
pce_df <- pce %>%
  # The raw Description values carry stray whitespace (the original matched a
  # literal trailing space). Trimming makes the match robust and consistent
  # with the income-cleaning chunk's str_trim() approach.
  filter(str_trim(Description) == "Personal consumption expenditures") %>%
  select(GeoName, X2017, X2019, X2020, X2022) %>%
  rename(state = GeoName) %>%
  filter(state != "United States") %>%
  mutate(
    pce_pre = ((X2019 - X2017) / X2017) * 100,
    pce_post = ((X2022 - X2020) / X2020) * 100
  )
# Re-key by two-letter abbreviation for joining with the other datasets.
pce_final <- pce_df %>%
  left_join(state_codes, by = c("state" = "State")) %>%
  select(Abbreviation, pce_pre, pce_post) %>%
  rename(State = Abbreviation)
head(pce_final)
## State pce_pre pce_post
## 1 AL 8.277837 22.66264
## 2 AK 5.572225 23.88433
## 3 AZ 10.681811 27.98044
## 4 AR 6.486729 22.09850
## 5 CA 10.787929 25.65983
## 6 CO 11.319759 27.96054
# export to file but I will # because we already have sync folder with github
#write.csv(pce_final, "data/data-clean/PCE.csv", row.names = FALSE)
This chunk is for cleaning Annual Personal Income
# BEA per capita personal income: pre/post-COVID growth rates by state.
income <- read.csv("data/SAINC1__ALL_AREAS_1929_2024.csv")
income_df1 <- income %>%
  # str_trim guards against stray whitespace in the raw Description field.
  filter(str_trim(Description) == "Per capita personal income (dollars) 2/") %>%
  rename(state = GeoName) %>%
  filter(state != "United States") %>%
  select(state, X2017, X2019, X2020, X2022) %>%
  mutate(
    income_pre = 100 * (X2019 - X2017) / X2017,
    income_post = 100 * (X2022 - X2020) / X2020
  )
# Re-key by two-letter abbreviation for joining with the other datasets.
income_final <- income_df1 %>%
  left_join(state_codes, by = c("state" = "State")) %>%
  select(State = Abbreviation, income_pre, income_post)
head(income_final)
## State income_pre income_post
## 1 AL 7.581065 12.688960
## 2 AK 7.086726 11.401225
## 3 AZ 10.317882 13.134632
## 4 AR 5.637273 17.407643
## 5 CA 10.315388 9.528295
## 6 CO 13.109126 18.359079
# export to file but I will # because we already have sync folder with github
#write.csv(income_final, "data/data-clean/INCOME.csv", row.names = FALSE)
This chunk is for cleaning GDP
# BEA real GDP by state: pre/post-COVID growth rates.
gdp_df <- read.csv("data/SAGDP1__ALL_AREAS_1997_2024.csv")
gdp_real <- gdp_df %>%
  filter(Description == "Real GDP (millions of chained 2017 dollars) 1/")
gdp_df2 <- gdp_real %>%
  rename(state = GeoName) %>%
  filter(state != "United States") %>%
  select(state, X2017, X2019, X2020, X2022) %>%
  mutate(
    gdp_pre = 100 * (X2019 - X2017) / X2017,
    gdp_post = 100 * (X2022 - X2020) / X2020
  )
# Re-key by two-letter abbreviation for joining with the other datasets.
gdp_final <- gdp_df2 %>%
  left_join(state_codes, by = c("state" = "State")) %>%
  select(State = Abbreviation, gdp_pre, gdp_post)
head(gdp_final)
## State gdp_pre gdp_post
## 1 AL 3.996621 7.3182724
## 2 AK -2.191186 0.8628568
## 3 AZ 7.948898 12.3828137
## 4 AR 2.694002 8.7421859
## 5 CA 8.358128 8.5462064
## 6 CO 9.570425 10.1607412
# export to file but I will # because we already have sync folder with github
#write.csv(gdp_final, "data/data-clean/GDP.csv", row.names = FALSE)
This chunk is to read NYFED data set
# NY Fed household-debt workbook: read each per-capita / delinquency sheet
# into its own data frame.
file_path <- "data/area_report_by_year.xlsx"
# Only these sheets are needed. The "auto" sheet has a different layout and
# is read separately below.
target_sheets <- c(
"population",
"creditcard",
"mortgage",
"student loan",
"total",
"auto_delinq",
"creditcard_delinq",
"mortgage_delinq",
"studentloan_delinq"
)
# The "auto" sheet's header starts higher up, so it needs its own skip value.
auto <- read_excel(file_path, sheet = "auto", skip = 4, col_names = TRUE)
#auto
for (sheet_name in target_sheets) {
df_name <- gsub(" ", "_", sheet_name) # e.g. "student loan" -> student_loan
# The real data start at row 9 of each sheet in the .xlsx file.
df <- read_excel(file_path, sheet = sheet_name, skip = 8, col_names = TRUE)
assign(df_name, df) # creates a data frame named after the sheet
#print(df_name)
}
# After loading, it is nice to inspect each sheet; commented out to keep the
# document short.
#population
#auto
#creditcard
#mortgage
#student_loan
#total
#auto_delinq
#creditcard_delinq
#mortgage_delinq
#studentloan_delinq
This chunk is for cleaning population
#population
# Consumer-panel population: percent growth pre (Q4 2017 -> Q4 2019) and
# post (Q4 2020 -> Q4 2022) COVID, by state.
population_clean <- population %>%
  filter(!is.na(state)) %>%
  transmute(
    state,
    population_pre = 100 * (Q4_2019 - Q4_2017) / Q4_2017,
    population_post = 100 * (Q4_2022 - Q4_2020) / Q4_2020
  )
# population_clean
This chunk is for cleaning credit card debt
# Credit card debt per capita: percent growth pre (Q4 2017 -> Q4 2019) and
# post (Q4 2020 -> Q4 2022) COVID, by state.
creditcard_clean <- creditcard %>%
  filter(!is.na(state)) %>%
  transmute(
    state,
    creditcard_pre = 100 * (Q4_2019 - Q4_2017) / Q4_2017,
    creditcard_post = 100 * (Q4_2022 - Q4_2020) / Q4_2020
  )
# creditcard_clean
This chunk is for cleaning auto debt
# Auto debt per capita: percent growth pre (Q4 2017 -> Q4 2019) and
# post (Q4 2020 -> Q4 2022) COVID, by state.
auto_clean <- auto %>%
  filter(!is.na(state)) %>%
  transmute(
    state,
    auto_pre = 100 * (Q4_2019 - Q4_2017) / Q4_2017,
    auto_post = 100 * (Q4_2022 - Q4_2020) / Q4_2020
  )
# auto_clean
This chunk is for cleaning mortgage
# Mortgage debt per capita (excl. HELOC): percent growth pre and post COVID.
mortgage_clean <- mortgage %>%
  filter(!is.na(state)) %>%
  transmute(
    state,
    mortgage_pre = 100 * (Q4_2019 - Q4_2017) / Q4_2017,
    mortgage_post = 100 * (Q4_2022 - Q4_2020) / Q4_2020
  )
# mortgage_clean
This chunk is for cleaning student loan
# Student loan debt per capita: percent growth pre and post COVID.
studentloan_clean <- student_loan %>%
  filter(!is.na(state)) %>%
  transmute(
    state,
    studentloan_pre = 100 * (Q4_2019 - Q4_2017) / Q4_2017,
    studentloan_post = 100 * (Q4_2022 - Q4_2020) / Q4_2020
  )
# studentloan_clean
This chunk is for cleaning total debt
# Total debt per capita: percent growth pre and post COVID.
total_clean <- total %>%
  filter(!is.na(state)) %>%
  transmute(
    state,
    total_pre = 100 * (Q4_2019 - Q4_2017) / Q4_2017,
    total_post = 100 * (Q4_2022 - Q4_2020) / Q4_2020
  )
# total_clean
This chunk is for cleaning creditcard delinq
# Credit card delinquency: the raw values are already percentage rates,
# so keep the Q4 snapshots as-is (no growth formula).
creditcard_delinq_clean <- creditcard_delinq %>%
  filter(!is.na(state)) %>%
  select(
    state,
    creditcard_delinq_pre = Q4_2019,
    creditcard_delinq_post = Q4_2022
  )
# creditcard_delinq_clean
This chunk is for cleaning auto delinq
# Auto loan delinquency: keep the Q4 snapshot rates as-is.
auto_delinq_clean <- auto_delinq %>%
  filter(!is.na(state)) %>%
  select(
    state,
    auto_delinq_pre = Q4_2019,
    auto_delinq_post = Q4_2022
  )
# auto_delinq_clean
This chunk is for cleaning mortgage delinq
# Mortgage delinquency: keep the Q4 snapshot rates as-is.
mortgage_delinq_clean <- mortgage_delinq %>%
  filter(!is.na(state)) %>%
  select(
    state,
    mortgage_delinq_pre = Q4_2019,
    mortgage_delinq_post = Q4_2022
  )
# mortgage_delinq_clean
This chunk is for cleaning studentloan delinq
# Student loan delinquency: keep the Q4 snapshot rates as-is.
studentloan_delinq_clean <- studentloan_delinq %>%
  filter(!is.na(state)) %>%
  select(
    state,
    studentloan_delinq_pre = Q4_2019,
    studentloan_delinq_post = Q4_2022
  )
# studentloan_delinq_clean
This chunk is to join all of the cleaned NYFED tables
# Fold all cleaned NY Fed tables into one wide table keyed by state.
# reduce() applies left_join pairwise, left to right, starting from the
# population table — the same result as a chain of left_join calls.
nyfed_parts <- list(
  population_clean, creditcard_clean, auto_clean, mortgage_clean,
  studentloan_clean, total_clean, creditcard_delinq_clean,
  auto_delinq_clean, mortgage_delinq_clean, studentloan_delinq_clean
)
NYFED_df <- reduce(nyfed_parts, left_join, by = "state")
head(NYFED_df)
## # A tibble: 6 × 21
## state population_pre population_post creditcard_pre creditcard_post auto_pre
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AK -0.444 -1.15 3.98 13.6 -0.998
## 2 AL 1.58 1.22 7.59 15.5 7.74
## 3 AR 1.53 2.31 10 16.1 5.34
## 4 AZ 4.47 4.48 10.2 16.5 9.41
## 5 CA 1.57 1.87 11.4 19.8 7.53
## 6 CO 3.31 3.61 8.22 17.6 4.75
## # ℹ 15 more variables: auto_post <dbl>, mortgage_pre <dbl>,
## # mortgage_post <dbl>, studentloan_pre <dbl>, studentloan_post <dbl>,
## # total_pre <dbl>, total_post <dbl>, creditcard_delinq_pre <dbl>,
## # creditcard_delinq_post <dbl>, auto_delinq_pre <dbl>,
## # auto_delinq_post <dbl>, mortgage_delinq_pre <dbl>,
## # mortgage_delinq_post <dbl>, studentloan_delinq_pre <dbl>,
## # studentloan_delinq_post <dbl>
# Export final result if needed
#write.csv(NYFED_df, "data/data-clean/NYFED.csv", row.names = FALSE)
This chunk is to join every data
# Load every cleaned dataset, restrict all of them to the states present in
# every one, and merge into the final analysis table.
clean_gdp <- read_csv("data/data-clean/GDP.csv")
clean_income <- read_csv("data/data-clean/INCOME.csv")
clean_nyfed <- read_csv("data/data-clean/NYFED.csv")
clean_pce <- read_csv("data/data-clean/PCE.csv")
clean_unemploy <- read_csv("data/data-clean/UNEMPLOY.csv")
clean_housing<- read_csv("data/data-clean/HOUSING.csv")
#clean_gdp
#clean_income
#clean_nyfed
#clean_pce
#clean_unemploy
# NYFED uses lowercase "state"; align it with the other tables' "State".
clean_nyfed <- clean_nyfed %>%
rename(State = state)
# Find common states
common_states <- Reduce(intersect, list(
clean_gdp$State,
clean_income$State,
clean_pce$State,
clean_nyfed$State,
clean_unemploy$State,
clean_housing$State
))
# Apply only common state
clean_gdp <- clean_gdp %>% filter(State %in% common_states)
clean_income <- clean_income %>% filter(State %in% common_states)
clean_pce <- clean_pce %>% filter(State %in% common_states)
clean_nyfed <- clean_nyfed %>% filter(State %in% common_states)
clean_unemploy <- clean_unemploy %>% filter(State %in% common_states)
clean_housing <- clean_housing %>% filter(State %in% common_states)
# Merge to one data set
# left_join is safe here because every table now shares the same State set.
merged_df <- clean_housing %>%
left_join(clean_gdp, by = "State") %>%
left_join(clean_income, by = "State") %>%
left_join(clean_pce, by = "State") %>%
left_join(clean_nyfed, by = "State") %>%
left_join(clean_unemploy, by = "State")
merged_df
## # A tibble: 51 × 37
## State zhvi_pre_avg zhvi_post_avg zhvi_pre_change zhvi_post_change gdp_pre
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AL 148993. 192427. 9.76 28.0 4.00
## 2 AK 292351. 335028. 7.66 12.3 -2.19
## 3 AZ 252522. 368418. 15.0 49.4 7.95
## 4 AR 140966. 175352. 7.83 26.1 2.69
## 5 CA 521016. 663241. 12.7 31.3 8.36
## 6 CO 375678. 486205. 12.6 32.8 9.57
## 7 CT 251600. 305414. 3.63 28.0 0.493
## 8 DE 262137. 318745. 5.58 27.9 7.29
## 9 DC 560797. 633072. 8.25 7.84 3.43
## 10 FL 228291. 308956. 13.2 45.6 6.90
## # ℹ 41 more rows
## # ℹ 31 more variables: gdp_post <dbl>, income_pre <dbl>, income_post <dbl>,
## # pce_pre <dbl>, pce_post <dbl>, population_pre <dbl>, population_post <dbl>,
## # creditcard_pre <dbl>, creditcard_post <dbl>, auto_pre <dbl>,
## # auto_post <dbl>, mortgage_pre <dbl>, mortgage_post <dbl>,
## # studentloan_pre <dbl>, studentloan_post <dbl>, total_pre <dbl>,
## # total_post <dbl>, creditcard_delinq_pre <dbl>, …
# export to file but I will # because we already have sync folder with github
# write.csv(merged_df, "data/data-clean/FINAL.csv", row.names = FALSE)
This chunk is to read final version data set
# Load the final merged dataset used by all downstream visualizations.
df<- read_csv("data/data-clean/FINAL.csv")
#df
# Earlier bar-chart version of the severity plot, kept for reference:
#ggplot(covidstates, aes(x = severity, fill = severity)) +
# geom_bar() +
# labs(title = "Number of States by COVID Severity Level",
# x = "Severity Level", y = "Number of States") +
# scale_fill_manual(values = c("Very low" = "lightgreen", "Low" = "yellow", "Medium" = "orange", "High" = #"red", "Very high" = "brown")) +
# theme_minimal()
# Boxplots of yearly case totals within each severity bin.
ggplot(covidstates, aes(x = severity, y = total_cases, fill = severity)) +
geom_boxplot() +
labs(title = "Distribution of Total Cases by Severity Level",
x = "Severity", y = "Total Cases") +
scale_fill_manual(values = c("Very low" = "lightgreen", "Low" = "yellow", "Medium" = "orange", "High" = "red", "Very high" = "brown")) + theme_minimal()
# Stacked proportional bars: share of states in each severity bin per year.
# Name the palette once so the mapping is easy to scan.
severity_colors <- c(
  "Very low" = "lightgreen", "Low" = "yellow", "Medium" = "orange",
  "High" = "red", "Very high" = "brown"
)
ggplot(covidstates, aes(x = year, fill = severity)) +
  geom_bar(position = "fill") +
  labs(title = "Distribution of COVID Severity by Year",
       y = "Proportion of States", x = "Year") +
  scale_fill_manual(values = severity_colors) +
  theme_minimal()
# Yearly cumulative case trajectories for six large/representative states.
covidstates %>%
filter(State %in% c("CA", "TX", "NY", "IL", "AZ", "FL")) %>%
ggplot(aes(x = year, y = total_cases, color = State)) +
geom_line(size = 1.2) + # NOTE(review): `size` for lines is deprecated since ggplot2 3.4 — prefer `linewidth`
labs(title = "COVID Cases Over Time",
x = "Year", y = "Total Cases") +
theme_minimal()
# Map two-letter abbreviations to lowercase full names so they match the
# region names used by map_data("state").
state_abbreviations <- data.frame(
abbreviation = state.abb,
full_name = state.name
)
# NOTE(review): state.abb/state.name cover only the 50 states, so DC's rows
# get NA names and drop out of the map join below — confirm acceptable.
covidstatesFULLNAME <- covidstates %>%
left_join(state_abbreviations, by = c("State" = "abbreviation")) %>%
mutate(State = tolower(full_name))%>%
select(-full_name)
# map visualization code
us_states <- map_data("state")
us_states$region <- tolower(us_states$region)
# Merge the map data with the COVID data
# NOTE(review): covidstatesFULLNAME has one row per state-year (2020-2023),
# so each state's polygon is duplicated and drawn once per year, stacked —
# consider filtering to a single year before joining.
covid_map_data <- us_states %>%
left_join(covidstatesFULLNAME, by = c("region" = "State")) %>%
filter(!is.na(severity))
# Choropleth of severity; coord_fixed keeps the map's aspect ratio.
ggplot(covid_map_data, aes(x = long, y = lat, group = group, fill = severity)) +
geom_polygon(color = "black") +
coord_fixed(1.1) +
labs(title = "COVID Case Severity by State",
fill = "Severity") +
scale_fill_manual(values = c("Very low" = "lightgreen", "Low" = "yellow", "Medium" = "orange", "High" = "red", "Very high"= "brown")) +
theme_void() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # NOTE(review): theme_void() removes axis text, so this line has no visible effect
# covidstatesFULLNAME
# covidstates
| Variable Name | Formula | Description |
|---|---|---|
| population_pre | ((Population_2019 - Population_2017) / Population_2017) × 100 | Population growth rate (Pre-COVID) |
| population_post | ((Population_2022 - Population_2020) / Population_2020) × 100 | Population growth rate (Post-COVID) |
# plot_usmap expects the join column to be named "state" (lowercase).
merged_dfLC <- merged_df %>%
rename(state = State)
## PRE covid populations
# NOTE(review): the plotted values are percent growth rates, but the legend
# is titled "Population" — consider "Change (%)".
plot_usmap(data = merged_dfLC, values = "population_pre", regions = "state") +
scale_fill_continuous(
name = "Population",
low = "lightgreen", high = "darkgreen", label = scales::comma
) +
labs(title = "Pre-COVID Population (2017–2019)") +
theme(legend.position = "right")
## POST covid populations
plot_usmap(data = merged_dfLC, values = "population_post", regions = "state") +
scale_fill_continuous(
name = "Population",
low = "lightblue", high = "darkblue", label = scales::comma
) +
labs(title = "Post-COVID Population (2020–2022)") +
theme(legend.position = "right")
### pre and post side by side
# Long format with an ordered two-level factor so the facets appear
# Pre-COVID first, Post-COVID second. factor(levels=, labels=) performs the
# same relabel-then-order that recode + factor did.
population_long <- merged_dfLC %>%
  select(state, population_pre, population_post) %>%
  pivot_longer(
    cols = c(population_pre, population_post),
    names_to = "period",
    values_to = "population"
  ) %>%
  mutate(
    period = factor(
      period,
      levels = c("population_pre", "population_post"),
      labels = c("Pre-COVID (2017–2019)", "Post-COVID (2020–2022)")
    )
  )
# Side-by-side choropleths sharing one continuous fill scale.
plot_usmap(data = population_long, values = "population", regions = "state") +
  facet_wrap(~ period) +
  scale_fill_continuous(
    name = "Population",
    low = "lightgreen", high = "darkgreen", label = scales::comma
  ) +
  labs(title = "Population: Pre vs. Post COVID") +
  theme(legend.position = "right")
| Variable Name | Formula | Description |
|---|---|---|
| gdp_pre | ((GDP_2019 - GDP_2017) / GDP_2017) × 100 | GDP growth rate (Pre-COVID) |
| gdp_post | ((GDP_2022 - GDP_2020) / GDP_2020) × 100 | GDP growth rate (Post-COVID) |
This graph compares how much GDP changed in each U.S. state before and after COVID-19.
The red bars show GDP growth after COVID (from 2020 to 2022).
The blue bars show GDP growth before COVID (from 2017 to 2019).
We can see that most states had higher growth after COVID (red bars are taller). For example, Florida (FL) had very strong growth after COVID, while North Dakota (ND) had negative growth.
# compare GDP
# Paired bars per state, ordered by post-COVID growth: the nudged-left bars
# are pre-COVID (2017->2019), nudged-right are post-COVID (2020->2022).
# The fill strings double as legend labels documenting each formula.
df %>%
select(State, gdp_pre, gdp_post) %>%
ggplot()+
geom_bar(mapping = aes(x=reorder(State, -gdp_post), y=gdp_pre
, fill = "GDP pre = ((X2019 - X2017) / X2017) * 100")
, stat = "identity"
, position = position_nudge(x = -0.3)
, width = 0.4)+
geom_bar(mapping = aes(x=reorder(State, -gdp_post), y=gdp_post, fill ="GDP post = ((X2022 - X2020) / X2020) * 100")
, stat = "identity"
, position = position_nudge(x = 0.3)
, width = 0.4)+
labs(title = "Real GDP Growth: Pre vs Post COVID", y = "GDP Change (%)", x = "")+
theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
| Variable Name | Formula | Description |
|---|---|---|
| income_pre | ((Income_2019 - Income_2017) / Income_2017) × 100 | Personal income growth (Pre-COVID) |
| income_post | ((Income_2022 - Income_2020) / Income_2020) × 100 | Personal income growth (Post-COVID) |
This graph compares how much people’s income changed in each U.S. state before and after COVID-19.
The red bars show income growth after COVID (from 2020 to 2022).
The blue bars show income growth before COVID (from 2017 to 2019).
This chart shows that in many states, income grew more after COVID than before. Colorado (CO) and New Mexico (NM) had high income growth after COVID. In contrast, California (CA) and New York (NY) had smaller increases. After COVID they are hired again, that can be one of reason.
# Paired bars per state, ordered by post-COVID income growth: nudged-left
# bars are pre-COVID (2017->2019), nudged-right are post-COVID (2020->2022).
df %>%
select(State, income_pre, income_post) %>%
ggplot() +
geom_bar(mapping = aes(x = reorder(State, -income_post)
, y = income_pre
, fill = "Income pre = ((X2019 - X2017) / X2017) * 100")
, stat = "identity"
, position = position_nudge(x = -0.3)
, width = 0.4) +
geom_bar(mapping = aes(x = reorder(State, -income_post)
, y = income_post
, fill = "Income post = ((X2022 - X2020) / X2020) * 100")
,stat = "identity"
, position = position_nudge(x = 0.3)
, width = 0.4) +
labs(title = "Personal Income Growth: Pre vs Post COVID", y = "Income Change (%)", x = "") +
theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
| Variable Name | Formula | Description |
|---|---|---|
| pce_pre | ((PCE_2019 - PCE_2017) / PCE_2017) × 100 | PCE growth rate (Pre-COVID) |
| pce_post | ((PCE_2022 - PCE_2020) / PCE_2020) × 100 | PCE growth rate (Post-COVID) |
This graph shows how much people spent money in each U.S. state before and after COVID-19.
The red bars show spending growth after COVID (2020–2022).
The blue bars show spending growth before COVID (2017–2019).
When we compare the overall pattern, the difference between pre and post COVID is not very big in most states. People may have continued spending money on basic needs even during hard times.
# Paired bars per state, ordered by post-COVID PCE growth: nudged-left bars
# are pre-COVID (2017->2019), nudged-right are post-COVID (2020->2022).
df %>%
select(State, pce_pre, pce_post) %>%
ggplot() +
geom_bar(mapping = aes(x = reorder(State, -pce_post)
, y = pce_pre
, fill = "PCE pre = ((X2019 - X2017) / X2017) * 100")
, stat = "identity"
, position = position_nudge(x = -0.3)
, width = 0.4) +
geom_bar(mapping = aes(x = reorder(State, -pce_post)
, y = pce_post
, fill = "PCE post = ((X2022 - X2020) / X2020) * 100")
, stat = "identity"
, position = position_nudge(x = 0.3)
, width = 0.4) +
labs(title = "Personal Consumption Expenditures Growth: Pre vs Post COVID",
y = "PCE Change (%)", x = "") +
theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
| Variable Name | Formula | Description |
|---|---|---|
| unemp_pre | Unemp_2019 - Unemp_2017 | Change in unemployment rate (Pre-COVID) |
| unemp_post | Unemp_2022 - Unemp_2020 | Change in unemployment rate (Post-COVID) |
This graph shows how the unemployment rate changed in each U.S. state before and after COVID-19.
The blue bars show the change from 2017 to 2019.
The red bars show the change from 2020 to 2022.
Most red bars are below zero(negative), which means unemployment went down after COVID.This is a good, more people were able to find jobs by 2022.
# Paired bars of unemployment-rate changes in percentage points (differences,
# not growth rates): pre = Dec 2019 - Dec 2017, post = Dec 2022 - Dec 2020.
# Negative values mean unemployment fell over the window.
df %>%
select(State, unemp_pre, unemp_post) %>%
ggplot() +
geom_bar(aes(x = reorder(State, -unemp_post)
, y = unemp_pre,
fill = "Unemp pre = 2019 - 2017")
, stat = "identity"
, position = position_nudge(x = -0.3)
, width = 0.4) +
geom_bar(aes(x = reorder(State, -unemp_post)
, y = unemp_post
, fill = "Unemp post = (2022 - 2020)")
, stat = "identity"
, position = position_nudge(x = 0.3)
, width = 0.4) +
labs(title = "Unemployment Rate Change: 2017–2019 vs 2020–2022", y = "Change in Unemployment Rate", x = "") +
theme(axis.text.x = element_text(angle = 90, hjust = 1),legend.position = "bottom")
This line graph shows the unemployment rate in each U.S. state from 2017
to 2022.
In 2020, unemployment suddenly went up in almost all states. This happened because of the COVID-19 pandemic. After 2020, the lines go down again. This means that the economy started to recover and people found jobs again.
# Reshape the four December unemployment snapshots to long form for a
# per-state line chart.
df_emp <- df %>%
select(State, unemp_17, unemp_19, unemp_20, unemp_22)
#df_emp
df_long <- df_emp %>%
pivot_longer(cols = starts_with("unemp_"), names_to = "Year", values_to = "Unemployment") %>%
mutate(Year = dplyr::recode(Year,
"unemp_17" = "2017",
"unemp_19" = "2019",
"unemp_20" = "2020",
"unemp_22" = "2022"))
#df_long
# One line per state; legend suppressed because 50+ entries would overwhelm it.
ggplot(df_long, aes(x = Year, y = Unemployment, group = State, color = State)) +
geom_line(size = 1) + # NOTE(review): `size` for lines is deprecated since ggplot2 3.4 — prefer `linewidth`
labs(title = "Unemployment Rate by State", y = "Unemployment Rate (%)", x = "Year") +
theme(legend.position = "none")
Interestingly, while credit card and car-related debt increased, other types of debt like student loans and mortgages actually went down.
This suggests that government programs or debt relief policies may have helped reduce those types of loans. But personal spending and borrowing still increased for many individuals.
| Variable Name | Formula | Description |
|---|---|---|
| creditcard_pre | ((Q4_2019 - Q4_2017) / Q4_2017) × 100 | Credit card debt growth rate (Pre-COVID) |
| creditcard_post | ((Q4_2022 - Q4_2020) / Q4_2020) × 100 | Credit card debt growth rate (Post-COVID) |
| creditcard_delinq_pre | Q4_2019 | Credit card delinquency rate (Pre-COVID) |
| creditcard_delinq_post | Q4_2022 | Credit card delinquency rate (Post-COVID) |
This chart shows how credit card debt changed before and after COVID-19 in each U.S. state.
The red bars show the growth in credit card debt before COVID.
The blue bars show the growth after COVID.
In almost every state, credit card debt grew more after COVID than before. This means many people used credit cards more.
# Reshape credit card debt growth into long form: one row per state x period,
# with the factor ordered pre before post. credit_df is reused by the violin
# plots in the next chunk.
credit_df <- df %>%
  select(State, creditcard_pre, creditcard_post) %>%
  pivot_longer(cols = c(creditcard_pre, creditcard_post),
               names_to = "Time", values_to = "Change") %>%
  mutate(Time = factor(Time, levels = c("creditcard_pre", "creditcard_post")))
## credit_df
# Dodged bars: pre vs post change in credit card debt per state.
# (Fixed user-facing typo: "Dept" -> "Debt" in the title and axis label.)
ggplot(credit_df, aes(x = State, y = Change, fill = Time)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.7), width = 0.9) +
  labs(title = "Change Rate in Credit Card Debt",
       y = "Credit Card Debt Change Rate (%)", x = "")+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        legend.position = "bottom")
These violin plots show how credit card debt and delinquency rates changed before and after COVID-19.
Left chart: - The left shape (red) shows credit card debt changes before COVID.
Right chart: - This chart shows how often people couldn’t pay back their credit card bills on time.
# Long form of credit card delinquency rates (already percentages), ordered
# pre before post.
credit_delinq_df <- df %>%
select(State, creditcard_delinq_pre, creditcard_delinq_post) %>%
pivot_longer(cols = c(creditcard_delinq_pre, creditcard_delinq_post),
names_to = "Time", values_to = "Change") %>%
mutate(Time = factor(Time, levels = c("creditcard_delinq_pre", "creditcard_delinq_post")))
#credit_delinq_df
# Violin of the debt-growth distribution across states, pre vs post.
gg_debt <- ggplot(credit_df, aes(x = Time, y = Change, fill = Time)) +
geom_violin(trim = FALSE, alpha = 0.5) +
labs(title = "Credit Card Debt Change (%)", y = "Change Rate (%)", x = "")+
theme(legend.position = "bottom")
#gg_debt
# Violin of the delinquency-rate distribution across states, pre vs post.
gg_delinq <- ggplot(credit_delinq_df, aes(x = Time, y = Change, fill = Time)) +
geom_violin(trim = FALSE, alpha = 0.5) +
labs(title = "Credit Card Delinquency Rate (%)", y = "Delinquency Rate (%)", x = "")+
theme(legend.position = "bottom")
#gg_delinq
# Render the two violins side by side.
grid.arrange(gg_debt, gg_delinq, ncol = 2)
| Variable Name | Formula | Description |
|---|---|---|
| mortgage_pre | ((Q4_2019 - Q4_2017) / Q4_2017) × 100 | Mortgage debt growth rate (Pre-COVID) |
| mortgage_post | ((Q4_2022 - Q4_2020) / Q4_2020) × 100 | Mortgage debt growth rate (Post-COVID) |
| mortgage_delinq_pre | Q4_2019 | Mortgage delinquency rate (Pre-COVID) |
| mortgage_delinq_post | Q4_2022 | Mortgage delinquency rate (Post-COVID) |
These boxplots show how mortgage debt and delinquency rates changed before and after COVID-19.
Left chart: This means that mortgage debt grew more slowly after COVID.
Right chart: This means fewer people failed to pay their mortgage after COVID.
Even though many people had trouble with income during the pandemic, mortgage debt did not increase and fewer people defaulted on their home loans.
This suggests that government programs or payment support policies helped people manage their mortgage payments during COVID-19.
# Four boxplots of the state-level distributions of mortgage debt growth
# and mortgage delinquency, pre vs post COVID.
# (Fixed typos in the original titles: "Capital" -> "Capita", "Dept" -> "Debt".)
# Percent Change in Mortgage Debt Balance per Capita (2017 -> 2019)
qq1 <- ggplot(df, aes(x = "", y = mortgage_pre)) +
  geom_boxplot(fill = "pink") +
  labs(title = "Mortgage Debt per Capita - Pre", y = "Mortgage Debt Change Rate (%)")
# Percent Change in Mortgage Debt Balance per Capita (2020 -> 2022)
qq2 <- ggplot(df, aes(x = "", y = mortgage_post)) +
  geom_boxplot(fill = "skyblue") +
  labs(title = "Mortgage Debt per Capita - Post", y = "Mortgage Debt Change Rate (%)")
# Mortgage delinquency rate, Q4 2019.
qq3 <- ggplot(df, aes(x = "", y = mortgage_delinq_pre)) +
  geom_boxplot(fill = "orange") +
  labs(title = "Mortgage Delinquency - Pre", y = "Delinquency Rate (%)")
# Mortgage delinquency rate, Q4 2022.
qq4 <- ggplot(df, aes(x = "", y = mortgage_delinq_post)) +
  geom_boxplot(fill = "purple") +
  labs(title = "Mortgage Delinquency - Post", y = "Delinquency Rate (%)")
grid.arrange(qq1, qq2, qq3, qq4, ncol = 4)
| Variable Name | Formula | Description |
|---|---|---|
| studentloan_pre | ((Q4_2019 - Q4_2017) / Q4_2017) × 100 | Student loan debt growth rate (Pre-COVID) |
| studentloan_post | ((Q4_2022 - Q4_2020) / Q4_2020) × 100 | Student loan debt growth rate (Post-COVID) |
| studentloan_delinq_pre | Q4_2019 | Student loan delinquency rate (Pre-COVID) |
| studentloan_delinq_post | Q4_2022 | Student loan delinquency rate (Post-COVID) |
This graph shows the percentage of people who were behind on their student loan payments before and after COVID.
The blue bars (pre) are very tall in many states.
The red bars (post) are much shorter in many states.
This means that student loan delinquency dropped a lot after COVID. This supports the idea that government programs, like loan payment pauses or forgiveness plans, helped people avoid falling behind on their student loans.
# Side-by-side bars of student loan delinquency per state, pre (Q4 2019)
# vs post (Q4 2022) COVID; states are ordered by the post-COVID rate.
df %>%
  select(State, studentloan_delinq_pre, studentloan_delinq_post) %>%
  ggplot() +
  # Pre-COVID bars, nudged to the left of each state's tick mark.
  geom_col(
    aes(x = reorder(State, -studentloan_delinq_post),
        y = studentloan_delinq_pre,
        fill = "Student Loan pre = Q4_2019"),
    position = position_nudge(x = -0.3),
    width = 0.4
  ) +
  # Post-COVID bars, nudged to the right.
  geom_col(
    aes(x = reorder(State, -studentloan_delinq_post),
        y = studentloan_delinq_post,
        fill = "Student Loan post = Q4_2022"),
    position = position_nudge(x = 0.3),
    width = 0.4
  ) +
  labs(title = "Student Loan Delinquency: Pre vs Post COVID", y = "Delinquency Rate (%)", x = "") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
These two charts show how student loan debt and delinquency changed before and after COVID-19.
Left chart: The before COVID is taller and wider, meaning more states had higher increases in student debt.
The after COVID is lower and tighter, showing that student loan debt grew more slowly and in some cases even decreased.
Right chart: The before COVID shows high and variable delinquency rates.
The after COVID is very short and close to 0.
This strongly supports the idea that government loan pauses, forgiveness programs, or other policies helped students during and after the pandemic.
# Student loan debt growth in long format, with readable "Pre"/"Post"
# period labels in display order.
student_df <- df %>%
  select(State, studentloan_pre, studentloan_post) %>%
  pivot_longer(cols = c(studentloan_pre, studentloan_post), names_to = "Time", values_to = "Change") %>%
  mutate(Time = factor(ifelse(Time == "studentloan_pre", "Pre", "Post"),
                       levels = c("Pre", "Post")))
#student_df
# Violin plot: distribution of student loan debt growth across states.
gg_student <- ggplot(student_df, aes(x = Time, y = Change, fill = Time)) +
  geom_violin(trim = FALSE, alpha = 0.4) +
  labs(title = "Student Loan Debt Change (%)", y = "Change Rate (%)", x = "") +
  scale_fill_manual(values = c("Pre" = "orange", "Post" = "purple"))
#gg_student
#names(df)
# Same reshaping for the delinquency-rate columns.
student_df2 <- df %>%
  select(State, studentloan_delinq_pre, studentloan_delinq_post) %>%
  pivot_longer(cols = c(studentloan_delinq_pre, studentloan_delinq_post), names_to = "Time", values_to = "Change") %>%
  mutate(Time = factor(ifelse(Time == "studentloan_delinq_pre", "Pre", "Post"),
                       levels = c("Pre", "Post")))
#student_df2
# Violin plot: distribution of student loan delinquency rates.
gg_student2 <- ggplot(student_df2, aes(x = Time, y = Change, fill = Time)) +
  geom_violin(trim = FALSE, alpha = 0.4) +
  labs(title = "Student Loan Debt Deliq Change (%)", y = "Change Rate (%)", x = "") +
  scale_fill_manual(values = c("Pre" = "yellow", "Post" = "red"))
#gg_student2
# Render both distributions side by side.
grid.arrange(gg_student, gg_student2, ncol = 2)
| Variable Name | Formula | Description |
|---|---|---|
| auto_pre | ((Q4_2019 - Q4_2017) / Q4_2017) × 100 | Auto loan debt growth rate (Pre-COVID) |
| auto_post | ((Q4_2022 - Q4_2020) / Q4_2020) × 100 | Auto loan debt growth rate (Post-COVID) |
| auto_delinq_pre | Q4_2019 | Auto loan delinquency rate (Pre-COVID) |
| auto_delinq_post | Q4_2022 | Auto loan delinquency rate (Post-COVID) |
This chart shows how car loan amounts and delinquency rates changed before and after COVID-19.
Auto Loan (Two Left) The green box (post) is higher and wider than the red box (pre). This means people borrowed more for cars after the pandemic.
Auto Loan Delinquency (Two Right) The purple and blue boxes are similar, but the purple one (post) is slightly lower. This means that even though borrowing increased, people still paid their car loans on time.
# Gather the four auto-loan measures (debt growth and delinquency,
# pre/post COVID) into one long table; the factor fixes the x-axis order.
df_auto <- df %>%
  select(auto_pre, auto_post, auto_delinq_pre, auto_delinq_post) %>%
  pivot_longer(everything(), names_to = "Category", values_to = "Value") %>%
  mutate(Category = factor(Category,
                           levels = c("auto_pre", "auto_post",
                                      "auto_delinq_pre", "auto_delinq_post")))
# One boxplot per measure, side by side.
ggplot(df_auto, aes(x = Category, y = Value, fill = Category)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Boxplots of Auto Loan and Delinquency (Pre vs Post)",
       x = NULL,
       y = "Value") +
  theme(axis.text.x = element_text(angle = 20, hjust = 1), legend.position = "bottom")
| Variable Name | Formula | Description |
|---|---|---|
| zhvi_post_change | ((ZHVI_2022 - ZHVI_2020) / ZHVI_2020) × 100 | Housing price change rate (Post-COVID) |
| zhvi_post_avg | (ZHVI_2020 + ZHVI_2021 + ZHVI_2022) / 3 | Average housing price during post-COVID years |
| zhvi_pre_change | ((ZHVI_2019 - ZHVI_2017) / ZHVI_2017) × 100 | Housing price change rate (Pre-COVID) |
| zhvi_pre_avg | (ZHVI_2017 + ZHVI_2018 + ZHVI_2019) / 3 | Average housing price during pre-COVID years |
# plot_usmap() joins on a lower-case "state" column, so rename it here.
merged_dfLC <- merged_df %>%
  rename(state = State)
## heat map for PRE covid housing change rate
plot_usmap(data = merged_dfLC, values = "zhvi_pre_change", regions = "state") +
  scale_fill_continuous(
    name = "Zillow Home Value Change (%)",
    low = "lightyellow",
    high = "red",
    label = scales::comma
  ) +
  labs(title = "Pre-COVID Housing Price Change (2017–2019)") +
  theme(legend.position = "right")
## heat map for POST covid housing change rate
# Same choropleth as above but for the 2020-2022 change, on a blue scale.
plot_usmap(data = merged_dfLC, values = "zhvi_post_change", regions = "state") +
  scale_fill_continuous(
    name = "Zillow Home Value Change (%)",
    low = "lightblue",
    high = "darkblue",
    label = scales::comma
  ) +
  labs(title = "Post-COVID Housing Price Change (2020–2022)") +
  theme(legend.position = "right")
### both pre and post, side by side
# One row per state x period, with a labelled and ordered period factor
# so the pre-COVID facet is drawn first.
housing_long <- merged_dfLC %>%
  select(state, zhvi_pre_change, zhvi_post_change) %>%
  pivot_longer(
    cols = c(zhvi_pre_change, zhvi_post_change),
    names_to = "period",
    values_to = "change"
  ) %>%
  mutate(
    period = dplyr::recode(period,
                           "zhvi_pre_change" = "Pre-COVID (2017–2019)",
                           "zhvi_post_change" = "Post-COVID (2020–2022)"),
    period = factor(period, levels = c("Pre-COVID (2017–2019)", "Post-COVID (2020–2022)"))
  )
# Faceted choropleth: one map per period, shared fill scale.
plot_usmap(data = housing_long, values = "change", regions = "state") +
  facet_wrap(~ period) +
  scale_fill_continuous(
    name = "ZHVI Change (%)",
    low = "lightyellow",
    high = "red",
    label = scales::comma
  ) +
  labs(title = "Housing Price Change: Pre vs. Post COVID") +
  theme(legend.position = "right")
# Keep only the two housing change-rate columns for the paired test.
# NOTE(review): `clean_housing` is built elsewhere in the file — assumed
# to contain one row per state; confirm against the cleaning section.
housing_summary <- clean_housing %>%
select(State, zhvi_pre_change, zhvi_post_change)
# Run paired t-test
# Paired: pre and post values belong to the same state.
t_test_result <- t.test(housing_summary$zhvi_post_change, housing_summary$zhvi_pre_change,
paired = TRUE, alternative = "two.sided")
# Check normality of the paired differences (t-test assumption);
# p = 0.83 below, so normality is not rejected.
shapiro.test(housing_summary$zhvi_pre_change - housing_summary$zhvi_post_change )
##
## Shapiro-Wilk normality test
##
## data: housing_summary$zhvi_pre_change - housing_summary$zhvi_post_change
## W = 0.9867, p-value = 0.8337
# Post-COVID change is ~18.7 percentage points higher on average
# (p < 2.2e-16), a highly significant increase.
t_test_result
##
## Paired t-test
##
## data: housing_summary$zhvi_post_change and housing_summary$zhvi_pre_change
## t = 18.127, df = 50, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 16.65046 20.80028
## sample estimates:
## mean difference
## 18.72537
# NOTE(review): `mortgage_summary` is created but never used below —
# the test reads the columns straight from `merged_df`.
mortgage_summary <- merged_df %>%
select(State, mortgage_pre, mortgage_post)
# Run paired t-test
# Paired test of mortgage debt growth, post vs pre COVID, by state.
t_test_result2 <- t.test(merged_df$mortgage_post,merged_df$mortgage_pre,
paired = TRUE, alternative = "two.sided")
# Normality of the paired differences: p = 0.24, not rejected.
shapiro.test(merged_df$mortgage_post - merged_df$mortgage_pre)
##
## Shapiro-Wilk normality test
##
## data: merged_df$mortgage_post - merged_df$mortgage_pre
## W = 0.97102, p-value = 0.2437
# Mortgage debt growth is ~9.6 points higher post-COVID (p < 2.2e-16).
t_test_result2
##
## Paired t-test
##
## data: merged_df$mortgage_post and merged_df$mortgage_pre
## t = 25.871, df = 50, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 8.822871 10.308125
## sample estimates:
## mean difference
## 9.565498
# NOTE(review): `income_summary` is created but never used below —
# the test reads the columns straight from `merged_df`.
income_summary <- merged_df %>%
select(State, income_pre, income_post)
# Run paired t-test
# Paired test of income growth, post vs pre COVID, by state.
t_test_result3 <- t.test(merged_df$income_post,merged_df$income_pre,
paired = TRUE, alternative = "two.sided")
# Normality of the paired differences: p = 0.12, not rejected.
shapiro.test(merged_df$income_post - merged_df$income_pre)
##
## Shapiro-Wilk normality test
##
## data: merged_df$income_post - merged_df$income_pre
## W = 0.96408, p-value = 0.1245
# Income growth is ~4.1 points higher post-COVID (p = 1.2e-14).
t_test_result3
##
## Paired t-test
##
## data: merged_df$income_post and merged_df$income_pre
## t = 10.794, df = 50, p-value = 1.156e-14
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 3.336458 4.862107
## sample estimates:
## mean difference
## 4.099282
# Correlation between housing prices pre and post
# Pearson correlation of average ZHVI levels across states: r = 0.98,
# i.e. expensive states before COVID stayed expensive after COVID.
cor_zhvi <- cor.test(merged_df$zhvi_pre_avg, merged_df$zhvi_post_avg, method = "pearson")
cor_zhvi
##
## Pearson's product-moment correlation
##
## data: merged_df$zhvi_pre_avg and merged_df$zhvi_post_avg
## t = 36.948, df = 49, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9694276 0.9900366
## sample estimates:
## cor
## 0.9825222
# Scatter of pre- vs post-COVID average ZHVI with a fitted OLS line
# (and its confidence band) to visualize the correlation above.
ggplot(data = merged_df, mapping = aes(x = zhvi_pre_avg, y = zhvi_post_avg)) +
  geom_point(colour = "blue") +
  geom_smooth(method = "lm", colour = "red", se = TRUE) +
  labs(
    title = "Correlation Between Pre and Post COVID Housing Prices",
    subtitle = "Strong positive correlation between ZHVI pre- and post-pandemic",
    x = "Pre-COVID Average Housing Prices (ZHVI)",
    y = "Post-COVID Average Housing Prices (ZHVI)"
  ) +
  theme_minimal()
# Pearson correlation of mortgage debt growth pre vs post COVID:
# moderate positive association (r = 0.63, p = 6e-07).
cor_mort <- cor.test(merged_df$mortgage_pre, merged_df$mortgage_post, method = "pearson")
cor_mort
##
## Pearson's product-moment correlation
##
## data: merged_df$mortgage_pre and merged_df$mortgage_post
## t = 5.7317, df = 49, p-value = 6.045e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4336644 0.7739861
## sample estimates:
## cor
## 0.6335339
# Scatter of pre- vs post-COVID mortgage debt growth with an OLS fit.
ggplot(data = merged_df, mapping = aes(x = mortgage_pre, y = mortgage_post)) +
  geom_point(colour = "blue") +
  geom_smooth(method = "lm", colour = "red", se = TRUE) +
  labs(
    title = "Correlation Between Pre and Post COVID Mortgage",
    x = "Pre-COVID Average Mortgage",
    y = "Post-COVID Average Mortgage"
  ) +
  theme_minimal()
# Pearson correlation of income growth pre vs post COVID:
# weak positive association (r = 0.32, p = 0.023).
cor_income <- cor.test(merged_df$income_pre, merged_df$income_post, method = "pearson")
cor_income
##
## Pearson's product-moment correlation
##
## data: merged_df$income_pre and merged_df$income_post
## t = 2.3517, df = 49, p-value = 0.02275
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.04700942 0.54612103
## sample estimates:
## cor
## 0.3184673
# Scatter of pre- vs post-COVID income growth with an OLS fit.
ggplot(data = merged_df, mapping = aes(x = income_pre, y = income_post)) +
  geom_point(colour = "blue") +
  geom_smooth(method = "lm", colour = "red", se = TRUE) +
  labs(
    title = "Correlation Between Pre and Post COVID Income",
    x = "Pre-COVID Average Income",
    y = "Post-COVID Average Income"
  ) +
  theme_minimal()
Before COVID, housing prices were mainly influenced by population and mortgage. After COVID, broader economic factors such as GDP, total debt, and credit card borrowing had a stronger impact.
Most were not statistically significant
Adjusted R²: 0.36
Not strong individually, but whole model was statistically valid (p = 0.0036)
# Multiple linear regression model
# Build the PRE-COVID predictor set: every column whose name contains
# "pre", minus the two outcome-related ZHVI columns.
pre_df <- merged_df %>%
select(contains("pre")) %>%
select(-zhvi_pre_change, -zhvi_pre_avg)
pre_names <- names(pre_df)
# Assemble "zhvi_pre_change ~ x1 + x2 + ..." from the remaining names.
formula_full <- as.formula(paste("zhvi_pre_change ~", paste(pre_names, collapse = " + ")))
model_full1 <- lm(formula_full, data = merged_df)
# Full pre-COVID model: no predictor is individually significant, but
# the overall F-test is (p = 0.0036); adjusted R^2 = 0.36.
summary(model_full1)
##
## Call:
## lm(formula = formula_full, data = merged_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.4368 -1.3000 0.2075 1.6407 6.1057
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.34676 5.65933 0.415 0.681
## gdp_pre 0.21443 0.35859 0.598 0.554
## income_pre -0.33060 0.49996 -0.661 0.513
## pce_pre 0.70535 0.62303 1.132 0.265
## population_pre 0.68661 0.75893 0.905 0.372
## creditcard_pre -0.22734 0.42108 -0.540 0.593
## auto_pre 0.04388 0.27742 0.158 0.875
## mortgage_pre 0.22029 1.40761 0.156 0.877
## studentloan_pre -0.05087 0.21592 -0.236 0.815
## total_pre 0.09585 1.92966 0.050 0.961
## creditcard_delinq_pre 0.64032 0.67576 0.948 0.350
## auto_delinq_pre -0.39783 0.56898 -0.699 0.489
## mortgage_delinq_pre -2.62600 1.77634 -1.478 0.148
## studentloan_delinq_pre 0.28630 0.43631 0.656 0.516
## unemp_pre 0.13806 1.35077 0.102 0.919
##
## Residual standard error: 3.514 on 36 degrees of freedom
## Multiple R-squared: 0.5417, Adjusted R-squared: 0.3634
## F-statistic: 3.039 on 14 and 36 DF, p-value: 0.003614
Used Step function to find best model.
This chunk is showing process of step function
# Stepwise AIC model selection (both directions) starting from the full
# pre-COVID model; trace = TRUE prints each add/drop step when knitted.
model_step1 <- step(model_full1, direction = "both", trace = TRUE)
Population (significant): More people → Higher house price change
Mortgage delinquency (significant): More mortgage trouble → Lower prices
Student loan delinquency: Slight positive effect, but not clear
Spending (PCE): May increase prices, but not strongly proven
Before COVID, housing prices were influenced more by population and debt, especially mortgage debt.
# Stepwise-selected pre-COVID model: population (+) and mortgage
# delinquency (-) are significant at the 5% level; adjusted R^2 = 0.46,
# an improvement over the full model's 0.36.
summary(model_step1)
##
## Call:
## lm(formula = zhvi_pre_change ~ pce_pre + population_pre + mortgage_delinq_pre +
## studentloan_delinq_pre, data = merged_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.4072 -1.8814 -0.2808 1.6847 6.8405
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.3339 3.9452 0.592 0.5570
## pce_pre 0.6114 0.4104 1.490 0.1432
## population_pre 1.2480 0.5552 2.248 0.0294 *
## mortgage_delinq_pre -2.7651 1.2618 -2.191 0.0335 *
## studentloan_delinq_pre 0.3466 0.1974 1.755 0.0858 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.243 on 46 degrees of freedom
## Multiple R-squared: 0.5012, Adjusted R-squared: 0.4578
## F-statistic: 11.55 on 4 and 46 DF, p-value: 1.415e-06
Adjusted R²: 0.78 and model is very strong (p < 0.00001).
There are some statistically valid values.
# Multiple linear regression model
# Multiple linear regression model for POST-COVID variables
# Same construction as the pre-COVID model: all "post" columns except
# the two outcome-related ZHVI columns become predictors.
post_df <- merged_df %>%
select(contains("post")) %>%
select(-zhvi_post_change, -zhvi_post_avg)
post_names <- names(post_df)
formula_full_post <- as.formula(paste("zhvi_post_change ~", paste(post_names, collapse = " + ")))
model_full2 <- lm(formula_full_post, data = merged_df)
# Full post-COVID model: GDP, income, population and credit card debt
# are individually significant; adjusted R^2 = 0.78 — a much stronger
# fit than the pre-COVID model.
summary(model_full2)
##
## Call:
## lm(formula = formula_full_post, data = merged_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.4295 -3.0744 -0.6837 2.6750 13.2850
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.51913 12.27869 2.078 0.04487 *
## gdp_post 0.98785 0.41337 2.390 0.02222 *
## income_post -1.12542 0.38509 -2.922 0.00597 **
## pce_post 0.88047 0.56539 1.557 0.12815
## population_post 2.39598 1.07403 2.231 0.03201 *
## creditcard_post -1.54762 0.51419 -3.010 0.00475 **
## auto_post -0.21758 0.35934 -0.605 0.54865
## mortgage_post -0.68766 1.42590 -0.482 0.63254
## studentloan_post 0.01944 0.29950 0.065 0.94861
## total_post 1.69262 1.99665 0.848 0.40219
## creditcard_delinq_post 0.54633 0.94697 0.577 0.56758
## auto_delinq_post -0.21689 0.70203 -0.309 0.75915
## mortgage_delinq_post 0.91165 5.39651 0.169 0.86679
## studentloan_delinq_post -1.61726 2.04614 -0.790 0.43447
## unemp_post 0.72238 0.86203 0.838 0.40756
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.527 on 36 degrees of freedom
## Multiple R-squared: 0.8428, Adjusted R-squared: 0.7816
## F-statistic: 13.78 on 14 and 36 DF, p-value: 1.763e-10
Used Step function to find best model.
This chunk is showing process of step function
# Stepwise AIC model selection (both directions) from the full
# post-COVID model; trace = TRUE prints each add/drop step when knitted.
model_step2 <- step(model_full2, direction = "both", trace = TRUE)
GDP (Significant): More GDP → Higher prices
Income : More income → Slightly lower price change
Spending (PCE): Slight positive effect
Population (very strong): More people → Much higher prices
Credit card debt (very strong): More credit card debt → Lower price change
Total debt: More debt → Higher prices
After COVID, housing prices were affected by macro economy and spending patterns. Especially credit card debt had a strong negative impact.
# Stepwise-selected post-COVID model: GDP (+), income (-), population (+),
# credit card debt (-) and total debt (+) are all significant;
# adjusted R^2 = 0.81.
summary(model_step2)
##
## Call:
## lm(formula = zhvi_post_change ~ gdp_post + income_post + pce_post +
## population_post + creditcard_post + total_post, data = merged_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.2902 -2.5636 0.0264 2.4206 13.6180
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.0975 8.3240 2.895 0.005882 **
## gdp_post 0.9422 0.2985 3.156 0.002881 **
## income_post -0.8729 0.2754 -3.169 0.002780 **
## pce_post 0.7138 0.4119 1.733 0.090074 .
## population_post 2.9476 0.7787 3.785 0.000461 ***
## creditcard_post -1.5189 0.3283 -4.627 3.27e-05 ***
## total_post 0.8062 0.3817 2.112 0.040394 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.186 on 44 degrees of freedom
## Multiple R-squared: 0.8357, Adjusted R-squared: 0.8133
## F-statistic: 37.3 on 6 and 44 DF, p-value: 1.086e-15
# Load the final cleaned state-level dataset used by the group-based
# tests below. NOTE(review): relative path assumes the working directory
# is the project root — confirm when knitting from another location.
df_stat <- read_csv("data/data-clean/FINAL.csv")
# df_stat
| Variable | ANOVA p-value | Post-Hoc Interpretation | Shapiro p-value | Levene p-value |
|---|---|---|---|---|
| gdp_post | 2.40e-08 | All three groups significantly different from each other | 0.798 | 0.273 |
| income_post | 0.039 | Low group < Medium & High groups (no difference between Medium & High) | 0.299 | 0.344 |
| unemp_post | 0.122 | No statistically significant differences between groups | 0.679 | 0.605 |
| total_post | 0.00016 | Differences exist; Post-hoc details required (likely Low vs High) | 0.401 | 0.138 |
| creditcard_post | 0.333 | No statistically significant differences between groups | 0.873 | 0.188 |
# Split states into population-growth terciles (Small / Medium / Large).
df_stat$pop_group <- cut(df_stat$population_post,
breaks = quantile(df_stat$population_post, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
labels = c("Small", "Medium", "Large"),
include.lowest = TRUE)
# One-way ANOVA: does post-COVID housing price change differ by group?
# Highly significant (p = 3e-08).
pop_anova <- aov(zhvi_post_change ~ pop_group, data = df_stat)
summary(pop_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## pop_group 2 2412 1206.2 25.39 3.01e-08 ***
## Residuals 48 2280 47.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fisher LSD pairwise comparisons: Large differs from both Small and
# Medium; Small vs Medium is not significant at 5%.
PostHocTest(pop_anova, method = "lsd")
##
## Posthoc multiple comparisons of means : Fisher LSD
## 95% family-wise confidence level
##
## $pop_group
## diff lwr.ci upr.ci pval
## Medium-Small 4.031883 -0.7216142 8.78538 0.0946 .
## Large-Small 16.181339 11.4278415 20.93484 1.3e-08 ***
## Large-Medium 12.149456 7.3959584 16.90295 5.0e-06 ***
##
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Boxplot of the outcome within each population group.
ggplot(df_stat, aes(x = pop_group, y = zhvi_post_change, fill = pop_group)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Post-COVID Housing Price Change by Population Group",
x = "Population Group",
y = "zhvi_post_change")
# residual analysis to check the assumptions
# Residual normality: p = 0.18, assumption not rejected.
shapiro.test(pop_anova$residuals)
##
## Shapiro-Wilk normality test
##
## data: pop_anova$residuals
## W = 0.96771, p-value = 0.1774
# Levene's test for homogeneity of variance
# Equal-variance assumption: p = 0.34, not rejected.
leveneTest(zhvi_post_change ~ factor(pop_group), data = df_stat)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 1.0936 0.3432
## 48
# Interactive table listing each state's group assignment.
pop_group_table <- df_stat %>%
select(State, population_post, zhvi_post_change, pop_group)
datatable(pop_group_table,
options = list(pageLength = 10, autoWidth = TRUE),
caption = "States by Population Group (Small / Medium / Large)")
# Split states into income-growth terciles (Low / Medium / High).
df_stat$income_group <- cut(df_stat$income_post,
breaks = quantile(df_stat$income_post, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
labels = c("Low", "Medium", "High"),
include.lowest = TRUE)
# One-way ANOVA: modestly significant group effect (p = 0.039).
income_anova <- aov(zhvi_post_change ~ income_group, data = df_stat)
summary(income_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## income_group 2 593 296.7 3.474 0.039 *
## Residuals 48 4099 85.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fisher LSD: the Low group differs from Medium and High;
# Medium vs High is not significant.
PostHocTest(income_anova, method = "lsd")
##
## Posthoc multiple comparisons of means : Fisher LSD
## 95% family-wise confidence level
##
## $income_group
## diff lwr.ci upr.ci pval
## Medium-Low 7.6532966 1.2800167 14.026577 0.0196 *
## High-Low 6.7293284 0.3560485 13.102608 0.0389 *
## High-Medium -0.9239682 -7.2972481 5.449312 0.7719
##
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Boxplot of the outcome within each income group.
ggplot(df_stat, aes(x = income_group, y = zhvi_post_change, fill = income_group)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Housing Price Change by Income Group",
x = "Income Group",
y = "zhvi_post_change")
# residual analysis to check the assumptions
# Residual normality: p = 0.93, assumption not rejected.
shapiro.test(income_anova$residuals)
##
## Shapiro-Wilk normality test
##
## data: income_anova$residuals
## W = 0.98935, p-value = 0.9254
# Levene's test for homogeneity of variance
# Equal-variance assumption: p = 0.34, not rejected.
leveneTest(zhvi_post_change ~ factor(income_group), data = df_stat)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 1.0908 0.3441
## 48
# Interactive table listing each state's group assignment.
income_group_table <- df_stat %>%
select(State, income_post, zhvi_post_change, income_group)
datatable(income_group_table,
options = list(pageLength = 10, autoWidth = TRUE),
caption = "States by Income Group (Low / Medium / High)")
# Split states into GDP-growth terciles (Low / Medium / High).
df_stat$gdp_group <- cut(df_stat$gdp_post,
breaks = quantile(df_stat$gdp_post, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
labels = c("Low", "Medium", "High"),
include.lowest = TRUE)
# One-way ANOVA: very strong group effect (p = 2.4e-08).
gdp_anova <- aov(zhvi_post_change ~ gdp_group, data = df_stat)
summary(gdp_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## gdp_group 2 2434 1216.9 25.86 2.4e-08 ***
## Residuals 48 2259 47.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fisher LSD: all three pairwise comparisons are significant —
# higher GDP growth groups saw larger housing price increases.
PostHocTest(gdp_anova, method = "lsd")
##
## Posthoc multiple comparisons of means : Fisher LSD
## 95% family-wise confidence level
##
## $gdp_group
## diff lwr.ci upr.ci pval
## Medium-Low 5.17528 0.444203 9.906358 0.0327 *
## High-Low 16.53958 11.808498 21.270653 6.6e-09 ***
## High-Medium 11.36430 6.633218 16.095372 1.4e-05 ***
##
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Residual normality: p = 0.54, assumption not rejected.
shapiro.test(gdp_anova$residuals)
##
## Shapiro-Wilk normality test
##
## data: gdp_anova$residuals
## W = 0.98001, p-value = 0.5392
# Equal-variance assumption: p = 0.27, not rejected.
leveneTest(zhvi_post_change ~ factor(gdp_group), data = df_stat)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 1.3341 0.273
## 48
# Boxplot of the outcome within each GDP group.
ggplot(df_stat, aes(x = gdp_group, y = zhvi_post_change, fill = gdp_group)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Housing Price Change by GDP Group",
x = "GDP Group",
y = "zhvi_post_change")
# Interactive table listing each state's group assignment.
gdp_group_table <- df_stat %>%
select(State, gdp_post, zhvi_post_change, gdp_group)
datatable(gdp_group_table,
options = list(pageLength = 10, autoWidth = TRUE),
caption = "States by GDP Group (Low / Medium / High)")
# Split states into unemployment-change terciles (Low / Medium / High).
df_stat$unemp_group <- cut(df_stat$unemp_post,
breaks = quantile(df_stat$unemp_post, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
labels = c("Low", "Medium", "High"),
include.lowest = TRUE)
# One-way ANOVA: no significant group effect (p = 0.12).
unemp_anova <- aov(zhvi_post_change ~ unemp_group, data = df_stat)
summary(unemp_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## unemp_group 2 394 197.17 2.202 0.122
## Residuals 48 4298 89.55
# Fisher LSD: only High-Low reaches 5% — but since the overall F-test
# is not significant, this pairwise result should be read with caution.
PostHocTest(unemp_anova, method = "lsd")
##
## Posthoc multiple comparisons of means : Fisher LSD
## 95% family-wise confidence level
##
## $unemp_group
## diff lwr.ci upr.ci pval
## Medium-Low 4.306174 -2.1287038 10.741051 0.1848
## High-Low 6.813317 0.1859737 13.440661 0.0441 *
## High-Medium 2.507144 -4.0303336 9.044621 0.4444
##
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Residual normality: p = 0.31, assumption not rejected.
shapiro.test(unemp_anova$residuals)
##
## Shapiro-Wilk normality test
##
## data: unemp_anova$residuals
## W = 0.97346, p-value = 0.3061
# Equal-variance assumption: p = 0.61, not rejected.
leveneTest(zhvi_post_change ~ factor(unemp_group), data = df_stat)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 0.5078 0.605
## 48
# Boxplot of the outcome within each unemployment group.
ggplot(df_stat, aes(x = unemp_group, y = zhvi_post_change, fill = unemp_group)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Housing Price Change by Unemployment Group",
x = "Unemployment Group",
y = "zhvi_post_change")
# Interactive table listing each state's group assignment.
unemp_group_table <- df_stat %>%
select(State, unemp_post, zhvi_post_change, unemp_group)
datatable(unemp_group_table,
options = list(pageLength = 10, autoWidth = TRUE),
caption = "States by Unemployment Group (Low / Medium / High)")
# Split states into total-debt-growth terciles (Low / Medium / High).
df_stat$total_group <- cut(df_stat$total_post,
breaks = quantile(df_stat$total_post, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
labels = c("Low", "Medium", "High"),
include.lowest = TRUE)
# One-way ANOVA: significant group effect (p = 0.00016).
total_anova <- aov(zhvi_post_change ~ total_group, data = df_stat)
summary(total_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## total_group 2 1435 717.6 10.57 0.000157 ***
## Residuals 48 3258 67.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fisher LSD: the High group differs from both Low and Medium;
# Low vs Medium does not differ.
PostHocTest(total_anova, method = "lsd")
##
## Posthoc multiple comparisons of means : Fisher LSD
## 95% family-wise confidence level
##
## $total_group
## diff lwr.ci upr.ci pval
## Medium-Low -0.1750897 -5.856411 5.506232 0.95085
## High-Low 11.1646443 5.483323 16.845966 0.00025 ***
## High-Medium 11.3397340 5.658412 17.021056 0.00021 ***
##
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Residual normality is REJECTED here (p = 0.019), so this ANOVA's
# p-values should be interpreted with caution (or verified with a
# non-parametric alternative such as Kruskal-Wallis).
shapiro.test(total_anova$residuals)
##
## Shapiro-Wilk normality test
##
## data: total_anova$residuals
## W = 0.94454, p-value = 0.01876
# Equal-variance assumption: p = 0.14, not rejected.
leveneTest(zhvi_post_change ~ factor(total_group), data = df_stat)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 2.0672 0.1377
## 48
# Boxplot of the outcome within each total-debt group.
ggplot(df_stat, aes(x = total_group, y = zhvi_post_change, fill = total_group)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Housing Price Change by Total Debt Group",
x = "Total Debt Group",
y = "zhvi_post_change")
# Interactive table listing each state's group assignment.
total_group_table <- df_stat %>%
select(State, total_post, zhvi_post_change, total_group)
datatable(total_group_table,
options = list(pageLength = 10, autoWidth = TRUE),
caption = "States by Total Debt Group (Low / Medium / High)")
# Split states into credit-card-debt-growth terciles (Low / Medium / High).
df_stat$creditcard_group <- cut(df_stat$creditcard_post,
breaks = quantile(df_stat$creditcard_post, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
labels = c("Low", "Medium", "High"),
include.lowest = TRUE)
# One-way ANOVA: no significant group effect (p = 0.33).
creditcard_anova <- aov(zhvi_post_change ~ creditcard_group, data = df_stat)
summary(creditcard_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## creditcard_group 2 210 104.99 1.124 0.333
## Residuals 48 4483 93.39
# Fisher LSD: no pairwise difference is significant, consistent with
# the non-significant overall F-test.
PostHocTest(creditcard_anova, method = "lsd")
##
## Posthoc multiple comparisons of means : Fisher LSD
## 95% family-wise confidence level
##
## $creditcard_group
## diff lwr.ci upr.ci pval
## Medium-Low 4.930728 -1.733912 11.595367 0.1434
## High-Low 3.007060 -3.657579 9.671699 0.3688
## High-Medium -1.923668 -8.588307 4.740972 0.5644
##
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Residual normality: p = 0.93, assumption not rejected.
shapiro.test(creditcard_anova$residuals)
##
## Shapiro-Wilk normality test
##
## data: creditcard_anova$residuals
## W = 0.98949, p-value = 0.9295
# Equal-variance assumption: p = 0.19, not rejected.
leveneTest(zhvi_post_change ~ factor(creditcard_group), data = df_stat)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 1.7293 0.1883
## 48
# Boxplot of the outcome within each credit-card-debt group.
ggplot(df_stat, aes(x = creditcard_group, y = zhvi_post_change, fill = creditcard_group)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Housing Price Change by Credit Card Debt Group",
x = "Credit Card Debt Group",
y = "zhvi_post_change")
# Interactive table listing each state's group assignment.
credit_group_table <- df_stat %>%
select(State, creditcard_post, zhvi_post_change, creditcard_group)
datatable(credit_group_table,
options = list(pageLength = 10, autoWidth = TRUE),
caption = "States by Credit Card Debt Group (Low / Medium / High)")
# Regression tree predicting post-COVID housing price change from the
# post-COVID state-level indicators. Low cp and small minsplit/minbucket
# allow a fairly deep tree on only 51 observations.
tree_model <- rpart(zhvi_post_change ~ zhvi_post_avg + population_post + gdp_post + income_post + unemp_post + total_post + creditcard_post
,data = df_stat
,method = "anova"
,control = rpart.control(cp = 0.0001,minsplit = 10, minbucket = 4)) # method = "anova" because the target is continuous (regression tree), not a class label
# Draw the fitted tree; extra = 101 adds the observation count and
# percentage to each node label.
rpart.plot(tree_model,
extra = 101,
fallen.leaves = TRUE,
main = "Decision Tree")
# Variable importance: GDP and population dominate the splits.
tree_model$variable.importance
## gdp_post population_post total_post creditcard_post income_post
## 2899.6427 2486.3110 1887.3291 836.0229 485.0208
## zhvi_post_avg unemp_post
## 309.8245 182.1748
# Leaf node assigned to each state by the fitted tree.
node_assignments <- tree_model$where
tree_nodes <- data.frame(State = df_stat$State, Node = node_assignments)
#tree_nodes
datatable(tree_nodes,
filter = "top",
caption = "state")
# Features for k-means clustering of states on their post-COVID profile.
clusters <- c("population_post", "gdp_post", "income_post", "unemp_post", "total_post", "zhvi_post_avg")
# Standardize so every feature contributes equally to the distances.
cluster_z <- scale(df_stat[, clusters])
set.seed(612)  # reproducible cluster assignment
# Renamed the result from `kmeans` to `kmeans_fit`: the original name
# masked the stats::kmeans() function it was calling.
kmeans_fit <- kmeans(cluster_z, centers = 5)
#kmeans_fit
df_stat$cluster <- as.factor(kmeans_fit$cluster)
# Outcome distribution within each cluster.
ggplot(df_stat, aes(x = cluster, y = zhvi_post_change, fill = cluster)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Cluster for Post COVID",
       x = "Cluster Group",
       y = "zhvi_post_change")
# List the states belonging to each cluster.
cluster_nodes <- data.frame(State = df_stat$State, Cluster = df_stat$cluster)
#cluster_nodes
split(cluster_nodes$State, cluster_nodes$Cluster)
## $`1`
## [1] "CA" "DC" "HI" "MA"
##
## $`2`
## [1] "AL" "GA" "IN" "KS" "KY" "ME" "MN" "MO" "NH" "OR" "SC" "VT" "VA" "WI"
##
## $`3`
## [1] "AK" "CT" "IL" "LA" "MD" "MI" "MS" "NJ" "NM" "NY" "OH" "PA" "RI" "WV"
##
## $`4`
## [1] "AZ" "CO" "FL" "ID" "MT" "NV" "NC" "TN" "TX" "UT" "WA"
##
## $`5`
## [1] "AR" "DE" "IA" "NE" "ND" "OK" "SD" "WY"
cluster_summary <- aggregate(df_stat[, clusters], by = list(Cluster = df_stat$cluster), FUN = mean)
cluster_summary[ , -1] <- round(cluster_summary[ , -1], 2)
print(cluster_summary)
## Cluster population_post gdp_post income_post unemp_post total_post
## 1 1 1.25 7.76 10.07 -4.65 13.62
## 2 2 2.24 8.05 12.35 -1.89 11.48
## 3 3 1.19 6.02 10.49 -3.55 11.12
## 4 4 4.13 11.76 14.58 -2.51 16.86
## 5 5 2.80 5.51 16.95 -1.64 11.73
## zhvi_post_avg
## 1 628886.4
## 2 265625.4
## 3 266754.1
## 4 366580.3
## 5 229319.0
# Second clustering on a reduced feature set (k = 6): population growth,
# income growth, total COVID cases, and average home value.
clusters2 <- c("population_post", "income_post", "total_post", "zhvi_post_avg")
cluster_z2 <- scale(df_stat[, clusters2])
set.seed(612)  # fixed seed so cluster assignments are reproducible
kmeans2 <- kmeans(cluster_z2, centers = 6)
df_stat$cluster2 <- as.factor(kmeans2$cluster)
ggplot(df_stat, aes(x = cluster2, y = zhvi_post_change, fill = cluster2)) +
  geom_boxplot() +
  theme_minimal() +
  # Fix: the old title ("GDP + Income + Population") did not match clusters2,
  # which contains no GDP term — relabeled to the variables actually used.
  labs(title = "Clustering: Population + Income + COVID Cases + Home Value",
       x = "Cluster",
       y = "Housing Price Change (%)")
# List which states landed in which cluster.
cluster_nodes2 <- data.frame(State = df_stat$State, Cluster = df_stat$cluster2)
split(cluster_nodes2$State, cluster_nodes2$Cluster)
## $`1`
## [1] "AR" "IA" "NE" "ND" "OK" "SD" "WY"
##
## $`2`
## [1] "AZ" "DE" "FL" "GA" "MT" "NV" "NC" "SC" "TN" "TX"
##
## $`3`
## [1] "CT" "MD" "MA" "NH" "NJ" "NY" "PA" "RI" "VT"
##
## $`4`
## [1] "CO" "ID" "UT"
##
## $`5`
## [1] "CA" "DC" "HI" "OR" "WA"
##
## $`6`
## [1] "AL" "AK" "IL" "IN" "KS" "KY" "LA" "ME" "MI" "MN" "MS" "MO" "NM" "OH" "VA"
## [16] "WV" "WI"
# Per-cluster means of the clustering variables (rounded to 2 decimals).
cluster_summary2 <- aggregate(df_stat[, clusters2],
                              by = list(Cluster = df_stat$cluster2), FUN = mean)
cluster_summary2[, -1] <- round(cluster_summary2[, -1], 2)
print(cluster_summary2)
## Cluster population_post income_post total_post zhvi_post_avg
## 1 1 2.51 17.21 12.12 216543.8
## 2 2 4.14 13.60 14.35 299750.4
## 3 3 1.91 9.70 10.57 359597.0
## 4 4 4.54 16.79 19.66 446324.5
## 5 5 1.61 10.90 15.33 593770.3
## 6 6 1.35 12.19 11.23 222252.1